<html lang="zh-CN">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   ML/NLP入门教程Python版（第一课）：文本处理  | 数螺 | NAUT IDEA
  </title>
  <link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap-theme.min.css" rel="stylesheet"/>
  <link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap.min.css" rel="stylesheet"/>
  <style type="text/css">
   /* ---- Article container (#xmain) ---- */

   /* Images are block-level, capped at the container width, with vertical spacing. */
   #xmain img {
     display: block;
     max-width: 100%;
     margin-top: 10px;
     margin-bottom: 10px;
   }

   /* Body paragraphs. */
   #xmain p {
     font-size: 16px;
     line-height:150%;
     margin-top: 20px;
   }

   /* Heading sizes inside the article, stepping down from h2 to h4. */
   #xmain h2 {
     font-size: 24px;
   }

   #xmain h3 {
     font-size: 20px;
   }

   #xmain h4 {
     font-size: 18px;
   }

   /* ---- Top banner (.header) ---- */

   /* Blue bar with white text, separated from the content below it. */
   .header {
     color: #ffffff;
     background-color: #0099ff;
     margin-bottom: 20px;
   }

   /* Banner text is laid out inline and vertically centred on its line. */
   .header p {
     display: inline-block;
     vertical-align: middle;
     font-size: 16px;
     margin: 0px;
     padding: 10px 0;
   }

   /* Links in the banner are white, like the surrounding banner text. */
   .header a {
     color: white;
   }

   /* Fixed-height logo. */
   .header img {
     height: 25px;
   }
  </style>
  <script src="http://cdn.bootcss.com/jquery/3.0.0/jquery.min.js">
  </script>
  <script src="http://nautstatic-10007657.file.myqcloud.com/static/css/readability.min.js" type="text/javascript">
  </script>
  <script type="text/javascript">
   // When the DOM is ready, run Readability over a copy of this page and show
   // the extracted article inside the #xmain container.
   $(document).ready(function() {
     // Description of the article's original location, passed to Readability
     // as its first constructor argument.
     var uri = {
       spec: "http://dataunion.org/17327.html",
       host: "http://dataunion.org",
       prePath: "http://dataunion.org",
       scheme: "http",
       pathBase: "http://dataunion.org/"
     };

     // Hand Readability a deep clone rather than the live document.
     var documentClone = document.cloneNode(true);
     var article = new Readability(uri, documentClone).parse();

     // NOTE(review): Readability's parse() is documented to return null when
     // no article can be extracted - confirm for the bundled version. Without
     // this guard the assignment below would throw a TypeError; bailing out
     // keeps the markup that is already on the page.
     if (!article || !article.content) {
       return;
     }

     // Replace the container's contents with the extracted article HTML.
     var target = document.getElementById("xmain");
     if (target) {
       target.innerHTML = article.content;
     }
   });
  </script>
  <!-- 1466459614: Accept with keywords: (title(0.285714285714):社区,Python,版,入门教程,数盟,文本处理,第一课, topn(0.466666666667):词性,数盟,图表,语料,文件,Python,词根,词频,文章,文档,方法,字典,特征值,词条,词形,python,语料库,计数,算法,标点符号,词干,对数,词项,词汇,部分,代码,单词,课程,频率,文本).-->
 </head>
 <body>
  <div class="header">
   <div class="container">
    <div class="row">
     <div class="col-xs-6 col-sm-6 text-left">
      <a href="/databee">
       <img alt="数螺" src="http://nautidea-10007657.cos.myqcloud.com/logo_white.png"/>
      </a>
      <a href="/databee">
       <p>
        数螺
       </p>
      </a>
     </div>
     <div class="hidden-xs col-sm-6 text-right">
      <p>
       致力于数据科学的推广和知识传播
      </p>
     </div>
    </div>
   </div>
  </div>
  <div class="container text-center">
   <h1>
    ML/NLP入门教程Python版（第一课）：文本处理
   </h1>
  </div>
  <div class="container" id="xmain">
   ﻿﻿
   <title>
    ML/NLP入门教程Python版（第一课）：文本处理 | 数盟社区
   </title>
   <!-- All in One SEO Pack 2.2.7.6.2 by Michael Torbert of Semper Fi Web Design[32,74] -->
   <!-- /all in one seo pack -->
   <!--
<div align="center">
<a href="http://strata.oreilly.com.cn/hadoop-big-data-cn?cmp=mp-data-confreg-home-stcn16_dataunion_pc" target="_blank"><img src="http://dataunion.org/wp-content/uploads/2016/05/stratabj.jpg"/ ></a>
</div>
-->
   <header id="header-web">
    <div class="header-main">
     <hgroup class="logo">
      <h1>
       <a href="http://dataunion.org/" rel="home" title="数盟社区">
        <img src="http://dataunion.org/wp-content/themes/yzipi/images/logo.png"/>
       </a>
      </h1>
     </hgroup>
     <!--logo-->
     <nav class="header-nav">
      <ul class="menu" id="menu-%e4%b8%bb%e8%8f%9c%e5%8d%95">
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-71" id="menu-item-71">
        <a href="http://dataunion.org/category/events" title="events">
         活动
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-22457" id="menu-item-22457">
          <a href="http://dataunion.org/2016timeline">
           2016档期
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-22459" id="menu-item-22459">
          <a href="http://dataunion.org/category/parterc">
           合作会议
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category current-post-ancestor current-menu-parent current-post-parent menu-item-has-children menu-item-20869" id="menu-item-20869">
        <a href="http://dataunion.org/category/tech" title="articles">
         文章
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20867" id="menu-item-20867">
          <a href="http://dataunion.org/category/tech/base" title="base">
           基础架构
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3302" id="menu-item-3302">
          <a href="http://dataunion.org/category/tech/ai" title="ai">
           人工智能
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3303" id="menu-item-3303">
          <a href="http://dataunion.org/category/tech/analysis" title="analysis">
           数据分析
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21920" id="menu-item-21920">
          <a href="http://dataunion.org/category/tech/dm">
           数据挖掘
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3314" id="menu-item-3314">
          <a href="http://dataunion.org/category/tech/viz" title="viz">
           可视化
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3305" id="menu-item-3305">
          <a href="http://dataunion.org/category/tech/devl" title="devl">
           编程语言
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-20876" id="menu-item-20876">
        <a href="http://dataunion.org/category/industry">
         行业
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-16328" id="menu-item-16328">
          <a href="http://dataunion.org/category/industry/case" title="case">
           行业应用
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-2112" id="menu-item-2112">
          <a href="http://dataunion.org/category/industry/demo" title="demo">
           Demo展示
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21562" id="menu-item-21562">
          <a href="http://dataunion.org/category/industry/news">
           行业资讯
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-311" id="menu-item-311">
        <a href="http://dataunion.org/category/sources" title="sources">
         资源
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20870" id="menu-item-20870">
        <a href="http://dataunion.org/category/books" title="book">
         图书
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21363" id="menu-item-21363">
        <a href="http://dataunion.org/category/training">
         课程
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-21853" id="menu-item-21853">
        <a href="http://dataunion.org/category/jobs">
         职位
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-22050" id="menu-item-22050">
          <a href="http://dataunion.org/category/career">
           职业规划
          </a>
         </li>
        </ul>
       </li>
      </ul>
     </nav>
     <!--header-nav-->
    </div>
   </header>
   <!--header-web-->
   <div id="main">
    <div id="soutab">
     <form action="http://dataunion.org/" class="search" method="get">
     </form>
    </div>
    <div id="container">
     <nav id="mbx">
      当前位置：
      <a href="http://dataunion.org">
       首页
      </a>
      &gt;
      <a href="http://dataunion.org/category/tech">
       文章
      </a>
      &gt;  正文
     </nav>
     <!--mbx-->
     <article class="content">
      <header align="centre" class="contenttitle">
       <div class="mscc">
        <h1 class="mscctitle">
         <a href="http://dataunion.org/17327.html">
          ML/NLP入门教程Python版（第一课）：文本处理
         </a>
        </h1>
        <address class="msccaddress ">
         <em>
          2,007 次阅读 -
         </em>
         <a href="http://dataunion.org/category/tech" rel="category tag">
          文章
         </a>
        </address>
       </div>
      </header>
      <div class="content-text">
       <p>
        翻译：
        <a href="http://python.jobbole.com/author/boisde/">
         boisde
        </a>
        出处：
        <a href="http://python.jobbole.com/81397/" target="_blank">
         伯乐在线
        </a>
       </p>
       <p>
        第一部分：文本处理
       </p>
       <p>
        欢迎来到机器学习和自然语言处理原型编码教程系列的第一部分。 Thoughtly正在制作一个着重于理解机器学习基础的系列教程,着重关注于在自然语言处理中的应用。
       </p>
       <p>
        <img src="http://dataunion.org/wp-content/uploads/2015/05/6941baebgw1ervkgkkrdxj20kk07bjt8.jpg"/>
       </p>
       <p>
        这一系列教程的目标是提供有据可查的可用代码，附加留言部分的深入探讨。代码将被放到GitHub上，在一个开放的许可证下，允许你任意修改或使用——不必署名（注明来源）。这里的代码为了明白起见以牺牲性能为代价写的比较冗长。如果你有大量的数据要处理，这些工具的可扩展性很可能无法达到完成你目的的要求。幸运的是，我们正在计划通过研究此处讨论的算法在当下最新的实现，来更好地对这个系列进行深入探索。这些内容都是黑盒子，是我们在初始系列中有意避免（到实用的程度）提到的内容。我们相信，在能使用这些黑盒子之前，在机器学习方面打下一个坚实的基础是至关重要的。
       </p>
       <p>
        第一部分的重点是如何从文本语料库提取出信息来。我们有意用介绍性的水平来开始教程，但是它涉及到很多不同的技巧和测量标准，这些方法都会在之后应用到更深入的机器学习任务上。
       </p>
       <h5>
        文本提取
       </h5>
       <p>
        下文介绍的以及此处代码中用到的工具，都假设我们将所选的语料当作一袋单词。这是你在处理文本文档的时候常常会看到的一个基本概念。将语料当作一袋单词是将文档向量化中的一个典型步骤，以供机器学习算法进一步处理。把文档转换成可处理向量通常还需要采取一些额外步骤，我们将在后面的课程中对此进行讨论。本课程中介绍的概念和工具将作为后面工具的构建模块。也许更重要的是，这些工具可以帮助你通过快速检查一个文本语料库，从而对它所包含的内容有一个基本的了解。
       </p>
       <p>
        本课程中我们所研究的代码及示例都是使用Python实现的。这些代码能够从NLTK(Python的自然语言工具包)所提供的不同的文本语料库中提取数据。这是个包括了ABC新闻的文字、圣经的创世纪、从古滕堡计划中选取的部分文本、总统就职演说、国情咨文和从网络上截取的部分文本所组成的语料库。另外，用户还能从他们自己提供的语料库来提取文本。从NLTK导入的代码并不是特别有趣，但我们想指出的是，要从NLTK文本语料库中提取数据是非常简单方便的。
       </p>
       <blockquote>
        <div class="line number1 index0 alt2">
         <code class="python keyword">
          def
         </code>
         <code class="python plain">
          load_text_corpus(args):
         </code>
        </div>
        <div class="line number2 index1 alt1">
        </div>
        <div class="line number3 index2 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          if
         </code>
         <code class="python plain">
          args[
         </code>
         <code class="python string">
          "abc"
         </code>
         <code class="python plain">
          ]:
         </code>
        </div>
        <div class="line number4 index3 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          logging.debug(
         </code>
         <code class="python string">
          "Loading the ABC corpus."
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number5 index4 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          name
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python string">
          "ABC"
         </code>
        </div>
        <div class="line number6 index5 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          words
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          nltk.corpus.abc.words()
         </code>
        </div>
        <div class="line number7 index6 alt2">
        </div>
        <div class="line number8 index7 alt1">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          elif
         </code>
         <code class="python plain">
          args[
         </code>
         <code class="python string">
          "genesis"
         </code>
         <code class="python plain">
          ]:
         </code>
        </div>
        <div class="line number9 index8 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          logging.debug(
         </code>
         <code class="python string">
          "Loading the Genesis corpus."
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number10 index9 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          name
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python string">
          "Genesis"
         </code>
        </div>
        <div class="line number11 index10 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          words
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          nltk.corpus.genesis.words()
         </code>
        </div>
        <div class="line number12 index11 alt1">
        </div>
        <div class="line number13 index12 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          elif
         </code>
         <code class="python plain">
          args[
         </code>
         <code class="python string">
          "gutenberg"
         </code>
         <code class="python plain">
          ]:
         </code>
        </div>
        <div class="line number14 index13 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          logging.debug(
         </code>
         <code class="python string">
          "Loading the Gutenberg corpus."
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number15 index14 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          name
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python string">
          "Gutenberg"
         </code>
        </div>
        <div class="line number16 index15 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          words
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          nltk.corpus.gutenberg.words()
         </code>
        </div>
        <div class="line number17 index16 alt2">
        </div>
        <div class="line number18 index17 alt1">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          elif
         </code>
         <code class="python plain">
          args[
         </code>
         <code class="python string">
          "inaugural"
         </code>
         <code class="python plain">
          ]:
         </code>
        </div>
        <div class="line number19 index18 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          logging.debug(
         </code>
         <code class="python string">
          "Loading the Inaugural Address corpus."
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number20 index19 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          name
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python string">
          "Inaugural"
         </code>
        </div>
        <div class="line number21 index20 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          words
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          nltk.corpus.inaugural.words()
         </code>
        </div>
        <div class="line number22 index21 alt1">
        </div>
        <div class="line number23 index22 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          elif
         </code>
         <code class="python plain">
          args[
         </code>
         <code class="python string">
          "stateUnion"
         </code>
         <code class="python plain">
          ]:
         </code>
        </div>
        <div class="line number24 index23 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          logging.debug(
         </code>
         <code class="python string">
          "Loading the State of the Union corpus."
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number25 index24 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          name
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python string">
          "Union"
         </code>
        </div>
        <div class="line number26 index25 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          words
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          nltk.corpus.state_union.words()
         </code>
        </div>
        <div class="line number27 index26 alt2">
        </div>
        <div class="line number28 index27 alt1">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          elif
         </code>
         <code class="python plain">
          args[
         </code>
         <code class="python string">
          "webtext"
         </code>
         <code class="python plain">
          ]:
         </code>
        </div>
        <div class="line number29 index28 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          logging.debug(
         </code>
         <code class="python string">
          "Loading the webtext corpus."
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number30 index29 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          name
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python string">
          "Web"
         </code>
        </div>
        <div class="line number31 index30 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          words
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          nltk.corpus.webtext.words()
         </code>
        </div>
        <div class="line number32 index31 alt1">
        </div>
        <div class="line number33 index32 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          elif
         </code>
         <code class="python plain">
          args[
         </code>
         <code class="python string">
          "custom"
         </code>
         <code class="python plain">
          ] !
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python color1">
          None
         </code>
         <code class="python plain">
          :
         </code>
        </div>
        <div class="line number34 index33 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          logging.debug(
         </code>
         <code class="python string">
          "Loading a custom corpus from "
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python plain">
          args[
         </code>
         <code class="python string">
          "custom"
         </code>
         <code class="python plain">
          ])
         </code>
        </div>
        <div class="line number35 index34 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          name
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python string">
          "Custom"
         </code>
        </div>
        <div class="line number36 index35 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          words
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          load_custom_corpus(args[
         </code>
         <code class="python string">
          "custom"
         </code>
         <code class="python plain">
          ])
         </code>
        </div>
        <div class="line number37 index36 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          else
         </code>
         <code class="python plain">
          :
         </code>
        </div>
        <div class="line number38 index37 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          words
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          ""
         </code>
        </div>
        <div class="line number39 index38 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          name
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python string">
          "None"
         </code>
        </div>
        <div class="line number40 index39 alt1">
        </div>
        <div class="line number41 index40 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          logging.debug(
         </code>
         <code class="python string">
          "Read "
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python functions">
          str
         </code>
         <code class="python plain">
          (
         </code>
         <code class="python functions">
          len
         </code>
         <code class="python plain">
          (words))
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python string">
          " words: "
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python functions">
          str
         </code>
         <code class="python plain">
          (words[
         </code>
         <code class="python value">
          0
         </code>
         <code class="python plain">
          :
         </code>
         <code class="python value">
          20
         </code>
         <code class="python plain">
          ]))
         </code>
        </div>
        <div class="line number42 index41 alt1">
        </div>
        <div class="line number43 index42 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          return
         </code>
         <code class="python plain">
          words, name
         </code>
        </div>
       </blockquote>
       <div>
        <div class="syntaxhighlighter notranslate python" id="highlighter_42635">
        </div>
       </div>
       <p>
        上面的大部分代码只是日志。有意思的部分在357行、362行、367行等。基于用户选择，每部分加载不同的语料库。 NLTK对从现有语料库中提取文本提供了一些非常便利的方法。这包括一些简单的、纯文本的语料库，也包括一些已经用各种方式被标记过的语料库 —— 语料库中的每个文档可能被标记过类别，或是语料库中的词性已被加过标签，如此等等。在本课程中，我们对NLTK的使用仅限于语料库的导入、词汇的切分，以及我们下面将讨论的两个操作：词干提取和词形还原。虽然不会总是如此，但到目前为止这已足以满足我们需要的所有功能。值得注意的是，您还可以在脚本中使用-custom参数导入自定义语料库。这应该是含有.txt文件的文件夹。该文件夹是递归读入的，所以含有.txt文件的子文件夹也能被处理。
       </p>
       <h5>
        词汇切分
       </h5>
       <p>
        词汇切分是指将语料库切分成各个独立部分（通常指单词）的行为。我们这样做是因为大多数ML算法无法处理任意长的文本字符串。相反，它们会假设你已经把语料库分割为单独的、算法可处理的词块（token）。虽然我们将在后面的课程详细讨论这个话题，但算法不一定限于一次只处理一个词块（token）。事实上，许多算法只在处理短序列（n-grams）时有用。本课程中我们将情况限定于一序列（1-grams），或者叫单序列（unigram）。
       </p>
       <p>
        对文本语料库做词汇切分的最简单的方法就是仅基于空白字符。这种方法确实非常简单，但它也有缺点。例如，它会导致位于句尾的文本包含有句尾标点符号，而一般不需要这样。在另一方面，类似can’t和e.g.这样带有词内标点的单词就没法被正确提取出来了。我们可以添加一步操作来删除所有非字母数字的字符。这将解决句尾标点符号的问题，同时也能将can’t和e.g.这样的单词提取出来，尽管是以丢掉了他们的标点符号的方式被提取出来的。然而，这也引入了一个新的问题。对于某些应用，我们还是希望保留标点符号。在创建语言模型的时候，句尾标点能区分一个单词是否是结尾单词，从这方面来说，额外的标点信息是有价值的。
       </p>
       <p>
        对于这个任务，我们要将一些标点符号（句号）作为一个词，使用NLTK word_tokenizer（它是基于TreebankWordTokenizer来实现的）来做词汇切分。这个分词器有很多针对各式各样的词汇做切分的规则。举例来说，“can’t”这样的缩写实际上被分成了两个词(token) – ca和n’t。有趣的是，这意味着我们最后会得到ca这样的词，它理想地匹配了can（在某些任务中）。这样的错误匹配是这种符号化算法带来的不幸后果。NLTK支持多种分词器。这是一个极其冗长的文件，
        <a href="http://www.nltk.org/api/nltk.tokenize.html">
         http://www.nltk.org/api/nltk.tokenize.html
        </a>
        ，但在里面可以找到它所支持的分词器的细节。
       </p>
       <h5>
        词干提取和词形还原
       </h5>
       <p>
        一旦取到了文字我们就可以开始处理它。脚本提供了许多简单的工具，它们会帮助我们查看我们所选择的内容。之后我们会深入谈到这些工具。首先，让我们思考一下该用什么方法来操作我们取到的文本。通常我们需要为ML算法提供从语料库提取的原始文本词汇（单词）。在其他情况下，将这些单词转成原始内容的各种变形也是有道理的。
       </p>
       <p>
        具体来说，我们经常要将原始单词截断到它的词根。那么，什么是一个词根呢？英语单词有从原始单词延伸出的通用后缀。就拿单词“run”为例。有很多扩展它的词 —— “runner”、“runs”、“running”等，即对基本定义的进一步阐述。词干提取是从“runner”、“runs”以及“running”中去除所有和“run”不一致的部分的过程。请注意，在上述列表中不包含“ran” —— 后面我们再对此进行阐述。下面是一个被提取词干的句子的具体实例。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a17017075799" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          stem(Jim is running to work.) =&gt; Jim is run to work.
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a17017075799-1">
               1
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a17017075799-1">
               <span class="crayon-e">
                stem
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-e">
                Jim
               </span>
               <span class="crayon-st">
                is
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                running
               </span>
               <span class="crayon-st">
                to
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                work
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =&gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                Jim
               </span>
               <span class="crayon-st">
                is
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                run
               </span>
               <span class="crayon-st">
                to
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                work
               </span>
               <span class="crayon-sy">
                .
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0012 seconds] -->
       <p>
        我们已经丢失了“吉姆在跑步”这个信息，尽管从此处的上下文来看，其他任何理解都说不通。我们不可能完全扭转这一点 —— 我们可以猜测那里曾经是什么词，但我们很可能会弄错。
       </p>
       <p>
        此处提供的代码可以让你对你的语料库进行词干提取。实际的词干提取是微不足道的，因为我们会使用NLTK来进行这部分工作。我们只需通过输入数组迭代，并返回使用NLTK Porter Stemmer所得到的各种提取后的词干变体。有许多不同的词干分析器可供选择，还包括非英语语言的选项。Porter Stemmer常用于英语。
       </p>
       <div>
        <div class="syntaxhighlighter notranslate python" id="highlighter_832802">
         <table border="0" cellpadding="0" cellspacing="0">
          <tbody>
           <tr>
            <td class="gutter">
             <div class="line number1 index0 alt2">
              1
             </div>
             <div class="line number2 index1 alt1">
              2
             </div>
             <div class="line number3 index2 alt2">
              3
             </div>
             <div class="line number4 index3 alt1">
              4
             </div>
             <div class="line number5 index4 alt2">
              5
             </div>
             <div class="line number6 index5 alt1">
              6
             </div>
             <div class="line number7 index6 alt2">
              7
             </div>
            </td>
            <td class="code">
             <div class="container">
              <div class="line number1 index0 alt2">
               <code class="python keyword">
                def
               </code>
               <code class="python plain">
                stem_words_array(words_array):
               </code>
              </div>
              <div class="line number2 index1 alt1">
               <code class="python spaces">
               </code>
               <code class="python plain">
                stemmer
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                nltk.PorterStemmer();
               </code>
              </div>
              <div class="line number3 index2 alt2">
               <code class="python spaces">
               </code>
               <code class="python plain">
                stemmed_words_array
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                [];
               </code>
              </div>
              <div class="line number4 index3 alt1">
               <code class="python spaces">
               </code>
               <code class="python keyword">
                for
               </code>
               <code class="python plain">
                word
               </code>
               <code class="python keyword">
                in
               </code>
               <code class="python plain">
                words_array:
               </code>
              </div>
              <div class="line number5 index4 alt2">
               <code class="python spaces">
               </code>
               <code class="python plain">
                stem
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                stemmer.stem(word);
               </code>
              </div>
              <div class="line number6 index5 alt1">
               <code class="python spaces">
               </code>
               <code class="python plain">
                stemmed_words_array.append(stem);
               </code>
              </div>
              <div class="line number7 index6 alt2">
               <code class="python spaces">
               </code>
               <code class="python keyword">
                return
               </code>
               <code class="python plain">
                stemmed_words_array;
               </code>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <p>
        词形还原类似于词干提取，但又有着重要的区别。与使用一系列简单的规则将一个单词截断成它的词根不同，词形还原尝试对输入的单词确定一个恰当的词根。本质上，词形还原试图找到一个单词的字典项，也称为单词的基本形(base term)。为了使这种查找能正确地工作，词形还原器必须知道您寻找的这个词在句子中的词性。生成语料库的词条与词干提取的代码基本上是相同的（尽管这段代码有上文略为提及的缺点，我们将在下面进一步对此进行讨论）。这里我们用了WordNetLemmatizer，它使用WordNet的数据库作为其查询指定词条的字典。
       </p>
       <div>
        <div class="syntaxhighlighter notranslate python" id="highlighter_143349">
         <table border="0" cellpadding="0" cellspacing="0">
          <tbody>
           <tr>
            <td class="gutter">
             <div class="line number1 index0 alt2">
              1
             </div>
             <div class="line number2 index1 alt1">
              2
             </div>
             <div class="line number3 index2 alt2">
              3
             </div>
             <div class="line number4 index3 alt1">
              4
             </div>
             <div class="line number5 index4 alt2">
              5
             </div>
             <div class="line number6 index5 alt1">
              6
             </div>
             <div class="line number7 index6 alt2">
              7
             </div>
            </td>
            <td class="code">
             <div class="container">
              <div class="line number1 index0 alt2">
               <code class="python keyword">
                def
               </code>
               <code class="python plain">
                lemmatize_words_array(words_array):
               </code>
              </div>
              <div class="line number2 index1 alt1">
               <code class="python spaces">
               </code>
               <code class="python plain">
                lemmatizer
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                nltk.stem.WordNetLemmatizer()
               </code>
              </div>
              <div class="line number3 index2 alt2">
               <code class="python spaces">
               </code>
               <code class="python plain">
                lemmatized_words_array
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                [];
               </code>
              </div>
              <div class="line number4 index3 alt1">
               <code class="python spaces">
               </code>
               <code class="python keyword">
                for
               </code>
               <code class="python plain">
                word
               </code>
               <code class="python keyword">
                in
               </code>
               <code class="python plain">
                words_array:
               </code>
              </div>
              <div class="line number5 index4 alt2">
               <code class="python spaces">
               </code>
               <code class="python plain">
                lemma
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                lemmatizer.lemmatize(word)
               </code>
              </div>
              <div class="line number6 index5 alt1">
               <code class="python spaces">
               </code>
               <code class="python plain">
                lemmatized_words_array.append(lemma)
               </code>
              </div>
              <div class="line number7 index6 alt2">
               <code class="python spaces">
               </code>
               <code class="python keyword">
                return
               </code>
               <code class="python plain">
                lemmatized_words_array;
               </code>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <p>
        正如上文所述，词形还原需要知道单词的词性。NLTK WordNetLemmatizer天真地假设，所有传入的单词都是名词。这种假设意味着你必须告诉词形还原器要传递的词不是一个名词，否则它会错误地将其视为一个名词。这个行为，加上对未知的单词（特别是当它混在一段文本中的时候）不做任何处理直接输出的行为，使得词形还原器处理效果很差。举例来说，如果让词形还原器处理“ran”这个词，在不指出“ran”属于一段文本的情况下，它将直接输出“ran”。它不知道作为名词的“ran”，因为很明显“ran”不是一个名词。但是，如果你正确地指出“ran”是动词，那么词形还原器就能输出“run”。与之相对，此处词干分析器就会输出“ran”。因此，如果我们要有效地利用词形还原器，我们也必须付出在源代码中对词性进行标注的代价，我们将在后面的课程中对词性标注的部分进行讨论。标记单词词性的额外成本也是词形还原器不像词干分析器那样应用广泛的原因之一 —— 所添加的功能抵不上所花的成本。
       </p>
       <h5>
        词汇量
       </h5>
       <p>
        现在，使用词干提取或词形还原的方法，我们已经拉取了一个语料库并且(视情况)对它做了变形，终于可以开始查看它的内容了。下面不是一个详尽的清单，但可以作为审查文本的技术参考。有些是立刻会用到的，其他则会在以后讨论到。
       </p>
       <p>
        第一项测量是最简单的——词汇计数。这个指标是语料库内所有唯一词的计数。正如你所期望的，代码很容易实现。唯一一个你之后还会再遇到的技巧，是我们决定利用Python里dictionary（字典）键的唯一性，即任一键在字典中不能出现超过一次。
       </p>
       <div>
        <div class="syntaxhighlighter notranslate python" id="highlighter_294380">
         <table border="0" cellpadding="0" cellspacing="0">
          <tbody>
           <tr>
            <td class="gutter">
             <div class="line number1 index0 alt2">
              1
             </div>
             <div class="line number2 index1 alt1">
              2
             </div>
             <div class="line number3 index2 alt2">
              3
             </div>
             <div class="line number4 index3 alt1">
              4
             </div>
             <div class="line number5 index4 alt2">
              5
             </div>
            </td>
            <td class="code">
             <div class="container">
              <div class="line number1 index0 alt2">
               <code class="python keyword">
                def
               </code>
               <code class="python plain">
                collect_unique_terms(corpus):
               </code>
              </div>
              <div class="line number2 index1 alt1">
               <code class="python spaces">
               </code>
               <code class="python plain">
                unique_vocabulary
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                {}
               </code>
              </div>
              <div class="line number3 index2 alt2">
               <code class="python spaces">
               </code>
               <code class="python keyword">
                for
               </code>
               <code class="python plain">
                term
               </code>
               <code class="python keyword">
                in
               </code>
               <code class="python plain">
                corpus:
               </code>
              </div>
              <div class="line number4 index3 alt1">
               <code class="python spaces">
               </code>
               <code class="python plain">
                unique_vocabulary[term]
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python value">
                1
               </code>
               <code class="python plain">
                ;
               </code>
              </div>
              <div class="line number5 index4 alt2">
               <code class="python spaces">
               </code>
               <code class="python keyword">
                return
               </code>
               <code class="python plain">
                unique_vocabulary;
               </code>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <p>
        这种方法可以让我们对我们的数据有所认知。思考我们使用词干提取及词形还原来考察ABC语料库后的如下输出。
       </p>
       <p>
        首先是原始语料文本：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a3c268304224" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          &gt; python words.py -vv -abc -s -vs
Loading the ABC corpus.
Read 766811 words: [u'PM', u'denies', u'knowledge', u'of', u'AWB', u'kickbacks', u'The', u'Prime', u'Minister', u'has', u'denied', u'he', u'knew', u'AWB', u'was', u'paying', u'kickbacks', u'to', u'Iraq', u'despite']
The corpus contains 766811 elements after processing
The corpus has a total vocabulary of 31885 unique tokens.
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a3c268304224-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a3c268304224-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a3c268304224-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a3c268304224-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a3c268304224-5">
               5
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a3c268304224-1">
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                vv
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                abc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                s
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-e">
                vs
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a3c268304224-2">
               <span class="crayon-e">
                Loading
               </span>
               <span class="crayon-e">
                the
               </span>
               <span class="crayon-e">
                ABC
               </span>
               <span class="crayon-v">
                corpus
               </span>
               <span class="crayon-sy">
                .
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a3c268304224-3">
               <span class="crayon-i">
                Read
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                766811
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'PM'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'denies'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'knowledge'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'of'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'AWB'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'kickbacks'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'The'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Prime'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Minister'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'has'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'denied'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'he'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'knew'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'AWB'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'was'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'paying'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'kickbacks'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'to'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Iraq'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'despite'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a3c268304224-4">
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                contains
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                766811
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                elements
               </span>
               <span class="crayon-e">
                after
               </span>
               <span class="crayon-e">
                processing
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a3c268304224-5">
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                has
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                a
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                total
               </span>
               <span class="crayon-e">
                vocabulary
               </span>
               <span class="crayon-i">
                of
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                31885
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                unique
               </span>
               <span class="crayon-v">
                tokens
               </span>
               <span class="crayon-sy">
                .
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0150 seconds] -->
       <p>
        其次是词形还原后的语料库：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a49477778594" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          &gt; python words.py -vv -abc -l -vs
Loading the ABC corpus.
Read 766811 words: [u'PM', u'denies', u'knowledge', u'of', u'AWB', u'kickbacks', u'The', u'Prime', u'Minister', u'has', u'denied', u'he', u'knew', u'AWB', u'was', u'paying', u'kickbacks', u'to', u'Iraq', u'despite']
The corpus contains 766811 elements after processing
The corpus has a total vocabulary of 28699 unique tokens.
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a49477778594-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a49477778594-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a49477778594-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a49477778594-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a49477778594-5">
               5
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a49477778594-1">
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                vv
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                abc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                l
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-e">
                vs
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a49477778594-2">
               <span class="crayon-e">
                Loading
               </span>
               <span class="crayon-e">
                the
               </span>
               <span class="crayon-e">
                ABC
               </span>
               <span class="crayon-v">
                corpus
               </span>
               <span class="crayon-sy">
                .
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a49477778594-3">
               <span class="crayon-i">
                Read
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                766811
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'PM'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'denies'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'knowledge'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'of'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'AWB'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'kickbacks'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'The'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Prime'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Minister'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'has'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'denied'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'he'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'knew'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'AWB'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'was'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'paying'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'kickbacks'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'to'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Iraq'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'despite'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a49477778594-4">
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                contains
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                766811
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                elements
               </span>
               <span class="crayon-e">
                after
               </span>
               <span class="crayon-e">
                processing
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a49477778594-5">
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                has
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                a
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                total
               </span>
               <span class="crayon-e">
                vocabulary
               </span>
               <span class="crayon-i">
                of
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                28699
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                unique
               </span>
               <span class="crayon-v">
                tokens
               </span>
               <span class="crayon-sy">
                .
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0115 seconds] -->
       <p>
        最后，是词干提取后的语料库：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a50319157265" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          &gt; python words.py -vv -abc -vs
Loading the ABC corpus.
Read 766811 words: [u'PM', u'denies', u'knowledge', u'of', u'AWB', u'kickbacks', u'The', u'Prime', u'Minister', u'has', u'denied', u'he', u'knew', u'AWB', u'was', u'paying', u'kickbacks', u'to', u'Iraq', u'despite']
The corpus contains 766811 elements after processing
The corpus has a total vocabulary of 22162 unique tokens.
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a50319157265-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a50319157265-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a50319157265-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a50319157265-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a50319157265-5">
               5
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a50319157265-1">
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                vv
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                abc
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-e">
                vs
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a50319157265-2">
               <span class="crayon-e">
                Loading
               </span>
               <span class="crayon-e">
                the
               </span>
               <span class="crayon-e">
                ABC
               </span>
               <span class="crayon-v">
                corpus
               </span>
               <span class="crayon-sy">
                .
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a50319157265-3">
               <span class="crayon-i">
                Read
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                766811
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'PM'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'denies'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'knowledge'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'of'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'AWB'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'kickbacks'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'The'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Prime'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Minister'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'has'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'denied'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'he'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'knew'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'AWB'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'was'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'paying'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'kickbacks'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'to'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Iraq'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'despite'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a50319157265-4">
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                contains
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                766811
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                elements
               </span>
               <span class="crayon-e">
                after
               </span>
               <span class="crayon-e">
                processing
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a50319157265-5">
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                has
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                a
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                total
               </span>
               <span class="crayon-e">
                vocabulary
               </span>
               <span class="crayon-i">
                of
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                22162
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                unique
               </span>
               <span class="crayon-v">
                tokens
               </span>
               <span class="crayon-sy">
                .
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0199 seconds] -->
       <p>
        可以看到，从原始数据到词形还原再到词干提取，语料库中的唯一词数量逐步减少，从31K到28K再到22K。这个模式在每个语料库中都会重复出现：在每个实例中，原始语料库的唯一词数量大于词形还原后的，而词形还原后的唯一词数量又大于词干提取后的。
       </p>
       <p>
        <img src="http://dataunion.org/wp-content/uploads/2015/05/6941baebgw1ervkgl7b9mj20m80gota9.jpg" alt="各语料库在原始、词形还原后、词干提取后的唯一词数量对比图"/>
       </p>
       <p>
        上面的图表是使用我们共享工程的Python代码生成。它对非定制语料库列表进行遍历，并分别计算原始、词干提取后、词形还原后的唯一字数量。你可以用命令行重现这个图表。你还可以得到一份同样内容的文本转储。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a57233431355" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          &gt; python words.py  -v --stemVsLemma
2015-02-02 19:49:22,255 (INFO): Corpora: ['ABC', 'Genesis', 'Gutenberg', 'Inaugural', 'Union', 'Web']
2015-02-02 19:49:22,255 (INFO): Word Counts: [31885, 25841, 51156, 9754, 14591, 21538]
2015-02-02 19:49:22,255 (INFO): Lemmatized Word Counts: [28699, 25444, 46456, 8763, 13111, 20056]
2015-02-02 19:49:22,255 (INFO): Stemmed Word Counts: [22162, 23542, 33521, 6135, 9533, 16599] 
2015-02-02 19:49:22,466 (INFO): The corpus contains 0 elements after processing
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a57233431355-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a57233431355-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a57233431355-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a57233431355-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a57233431355-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a57233431355-6">
               6
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a57233431355-1">
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-i">
                stemVsLemma
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a57233431355-2">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                19
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                49
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                22
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                255
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                Corpora
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'ABC'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Genesis'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Gutenberg'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Inaugural'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Union'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Web'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a57233431355-3">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                19
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                49
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                22
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                255
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-t">
                Word
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                Counts
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                31885
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                25841
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                51156
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                9754
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                14591
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                21538
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a57233431355-4">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                19
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                49
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                22
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                255
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                Lemmatized
               </span>
               <span class="crayon-t">
                Word
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                Counts
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                28699
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                25444
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                46456
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                8763
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                13111
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                20056
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a57233431355-5">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                19
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                49
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                22
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                255
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                Stemmed
               </span>
               <span class="crayon-t">
                Word
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                Counts
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                22162
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                23542
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                33521
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                6135
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                9533
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                16599
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a57233431355-6">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                19
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                49
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                22
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                466
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                contains
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                elements
               </span>
               <span class="crayon-e">
                after
               </span>
               <span class="crayon-v">
                processing
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0110 seconds] -->
       <p>
       </p>
       <h5>
        词项存在
       </h5>
       <p>
        再增加一点复杂性，我们接下来看看如何输出上文提到的词项。我们生成一个CSV文件，其中包含语料中出现的每个唯一词。它使用了上文中的collect_unique_terms方法，只是不再仅仅输出唯一词的计数，而是遍历返回的字典，把其中的每个键（即每个词项）逐一写出。
       </p>
       <div>
        <div class="syntaxhighlighter notranslate python" id="highlighter_302141">
         <table border="0" cellpadding="0" cellspacing="0">
          <tbody>
           <tr>
            <td class="gutter">
             <div class="line number1 index0 alt2">
              1
             </div>
             <div class="line number2 index1 alt1">
              2
             </div>
             <div class="line number3 index2 alt2">
              3
             </div>
             <div class="line number4 index3 alt1">
              4
             </div>
             <div class="line number5 index4 alt2">
              5
             </div>
             <div class="line number6 index5 alt1">
              6
             </div>
             <div class="line number7 index6 alt2">
              7
             </div>
            </td>
            <td class="code">
             <div class="container">
              <div class="line number1 index0 alt2">
               <code class="python keyword">
                def
               </code>
               <code class="python plain">
                output_corpus_terms(corpus, unique_vocabulary
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python color1">
                None
               </code>
               <code class="python plain">
                ):
               </code>
              </div>
              <div class="line number2 index1 alt1">
               <code class="python spaces">
               </code>
               <code class="python keyword">
                if
               </code>
               <code class="python plain">
                unique_vocabulary
               </code>
               <code class="python keyword">
                is
               </code>
               <code class="python color1">
                None
               </code>
               <code class="python plain">
                :
               </code>
              </div>
              <div class="line number3 index2 alt2">
               <code class="python spaces">
               </code>
               <code class="python plain">
                unique_vocabulary
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                collect_unique_terms(corpus)
               </code>
              </div>
              <div class="line number4 index3 alt1">
               <code class="python spaces">
               </code>
               <code class="python plain">
                output_csv_file
               </code>
               <code class="python keyword">
                =
               </code>
               <code class="python plain">
                open_csv_file(
               </code>
               <code class="python string">
                "corpus_terms.csv"
               </code>
               <code class="python plain">
                , [
               </code>
               <code class="python string">
                "Term"
               </code>
               <code class="python plain">
                ])
               </code>
              </div>
              <div class="line number5 index4 alt2">
               <code class="python spaces">
               </code>
               <code class="python keyword">
                for
               </code>
               <code class="python plain">
                term
               </code>
               <code class="python keyword">
                in
               </code>
               <code class="python plain">
                unique_vocabulary:
               </code>
              </div>
              <div class="line number6 index5 alt1">
               <code class="python spaces">
               </code>
               <code class="python plain">
                logging.debug(term)
               </code>
              </div>
              <div class="line number7 index6 alt2">
               <code class="python spaces">
               </code>
               <code class="python plain">
                output_csv_file.writerow([term])
               </code>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <p>
        虽然在仅仅输出一个单词的情况下，它可能看起来意义非常有限，但也有比统计每个词的总计数更好的算法（我们接下来将要讨论）。正如对微博(tweets)的情感分析一样——在处理较短的文字序列时，我们更倾向于选择它。
       </p>
       <p>
        您可以在命令行中使用所提供的代码生成CSV。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a61628191919" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          python words.py -v --termPresence --gutenberg
2015-02-02 20:07:49,623 (INFO): The corpus contains 2621613 elements after processing
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a61628191919-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a61628191919-2">
               2
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a61628191919-1">
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-v">
                termPresence
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-i">
                gutenberg
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a61628191919-2">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                20
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                07
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                49
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                623
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                contains
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                2621613
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                elements
               </span>
               <span class="crayon-e">
                after
               </span>
               <span class="crayon-v">
                processing
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0020 seconds] -->
       <p>
       </p>
       <h5>
        词频
       </h5>
       <p>
        词频是词项存在的延伸。在确定词频时，我们不只是简单地指出一个词是否存在，而是更关心语料库中每个词出现的次数。计算词频的代码与确定词项存在的代码非常相似。
       </p>
       <blockquote>
        <div class="line number1 index0 alt2">
         <code class="python keyword">
          def
         </code>
         <code class="python plain">
          collect_and_output_corpus_term_frequencies(corpus, corpus_name):
         </code>
        </div>
        <div class="line number2 index1 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          term_frequencies
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          collect_term_counts(corpus)
         </code>
        </div>
        <div class="line number3 index2 alt2">
        </div>
        <div class="line number4 index3 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          output_csv_file
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          open_csv_file(
         </code>
         <code class="python string">
          "term_frequencies.csv"
         </code>
         <code class="python plain">
          , [
         </code>
         <code class="python string">
          "Term"
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python string">
          "Frequency"
         </code>
         <code class="python plain">
          ])
         </code>
        </div>
        <div class="line number5 index4 alt2">
        </div>
        <div class="line number6 index5 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          unsorted_array
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          [[key,value]
         </code>
         <code class="python keyword">
          for
         </code>
         <code class="python plain">
          key, value
         </code>
         <code class="python keyword">
          in
         </code>
         <code class="python plain">
          term_frequencies.iteritems()]
         </code>
        </div>
        <div class="line number7 index6 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          sorted_array
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python functions">
          sorted
         </code>
         <code class="python plain">
          (unsorted_array, key
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python keyword">
          lambda
         </code>
         <code class="python plain">
          term_frequency: term_frequency[
         </code>
         <code class="python value">
          1
         </code>
         <code class="python plain">
          ], reverse
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python color1">
          True
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number8 index7 alt1">
        </div>
        <div class="line number9 index8 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          for
         </code>
         <code class="python plain">
          term, frequency
         </code>
         <code class="python keyword">
          in
         </code>
         <code class="python plain">
          sorted_array:
         </code>
        </div>
        <div class="line number10 index9 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          output_csv_file.writerow([term]
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python plain">
          [frequency])
         </code>
        </div>
        <div class="line number11 index10 alt2">
        </div>
        <div class="line number12 index11 alt1">
         <code class="python spaces">
         </code>
         <code class="python comments">
          # output a bar chart illustrating the above
         </code>
        </div>
        <div class="line number13 index12 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          chart_term_frequencies(
         </code>
         <code class="python string">
          "term_frequencies.png"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number14 index13 alt1">
         <code class="python spaces">
         </code>
         <code class="python string">
          "Term Frequencies ("
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python plain">
          corpus_name
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python string">
          ")"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number15 index14 alt2">
         <code class="python spaces">
         </code>
         <code class="python string">
          "Term Frequencies"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number16 index15 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          sorted_array, [
         </code>
         <code class="python value">
          0
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python value">
          1
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python value">
          2
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python keyword">
          -
         </code>
         <code class="python value">
          3
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python keyword">
          -
         </code>
         <code class="python value">
          2
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python keyword">
          -
         </code>
         <code class="python value">
          1
         </code>
         <code class="python plain">
          ])
         </code>
        </div>
        <div class="line number17 index16 alt2">
        </div>
        <div class="line number18 index17 alt1">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          return
         </code>
         <code class="python plain">
          term_frequencies
         </code>
        </div>
       </blockquote>
       <div>
        <div class="syntaxhighlighter notranslate python" id="highlighter_707935">
        </div>
       </div>
       <p>
        词频，或者我们将在下一节中看到的基于它的变形，是机器学习的矢量化过程中常见的主要组成部分。一般来说，ML算法需要一组能代表需要判定的单个样本的特征集。但是，文本并不能自动地适应这种模式。要迫使它去适应，我们不能考虑文本本身，而是要看文字实例的数量。词频是将文本域映射到ML友好的实数域的一个简单方法。
       </p>
       <p>
        <img alt="语料库中三个最常见词条和最不常见词条的原始词频图表" src="http://dataunion.org/wp-content/uploads/2015/05/6941baebgw1ervkglmoekj20m80gojsc.jpg"/>
       </p>
       <p>
        您可以在命令行中使用提供的源代码生成包含所有词条和其频率的有序列表的CSV文件以及上面的图表。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a70541903419" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          &gt; python words.py -v --termFrequency --genesis
2015-02-02 20:27:45,298 (INFO): The corpus contains 315268 elements after processing
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a70541903419-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a70541903419-2">
               2
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a70541903419-1">
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-v">
                termFrequency
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-i">
                genesis
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a70541903419-2">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                20
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                27
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                45
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                298
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                contains
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                315268
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                elements
               </span>
               <span class="crayon-e">
                after
               </span>
               <span class="crayon-v">
                processing
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0020 seconds] -->
       <p>
       </p>
       <h5>
        对数归一化词频
       </h5>
       <p>
        该图显示了语料库中三个最常见词条和最不常见词条的原始频率。这里提供的代码可以为用户选择的语料库生成上述图表。此外，运行该脚本还会生成一个名为term_frequencies.csv的文件，它能让用户看到一个包含文档中的所有唯一字及其相应词频的电子表格。使用美国总统就职演说语料库来生成的命令如下：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a7e010439009" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          &gt; python words.py -vv --termFrequency --inaugural	 	 
2015-02-02 20:04:09,345 (DEBUG): Loading the Inaugural Address corpus.	 	 
2015-02-02 20:04:09,464 (DEBUG): Read 145735 words: [u'Fellow', u'-', u'Citizens', u'of', u'the', u'Senate', u'and', u'of', u'the', u'House', u'of', u'Representatives', u':', u'Among', u'the', u'vicissitudes', u'incident', u'to', u'life', u'no']	 	 
2015-02-02 20:04:09,465 (INFO): The corpus contains 145735 elements after processing
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a7e010439009-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a7e010439009-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-576865d969a7e010439009-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a7e010439009-4">
               4
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a7e010439009-1">
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                vv
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-v">
                termFrequency
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-i">
                inaugural
               </span>
               <span class="crayon-h">
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a7e010439009-2">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                20
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                04
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                09
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                345
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                DEBUG
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                Loading
               </span>
               <span class="crayon-e">
                the
               </span>
               <span class="crayon-e">
                Inaugural
               </span>
               <span class="crayon-e">
                Address
               </span>
               <span class="crayon-v">
                corpus
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-h">
               </span>
              </div>
              <div class="crayon-line" id="crayon-576865d969a7e010439009-3">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                20
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                04
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                09
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                464
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                DEBUG
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                Read
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                145735
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Fellow'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                '-'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Citizens'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'of'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'the'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Senate'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'and'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'of'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'the'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'House'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'of'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Representatives'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                ':'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'Among'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'the'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'vicissitudes'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'incident'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'to'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'life'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                u
               </span>
               <span class="crayon-s">
                'no'
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a7e010439009-4">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                20
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                04
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                09
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                465
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                contains
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                145735
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                elements
               </span>
               <span class="crayon-e">
                after
               </span>
               <span class="crayon-v">
                processing
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0125 seconds] -->
       <p>
        当特征值没有被规范到相近的尺度时，机器学习算法往往无法很好地工作。就词频而言，常用词的出现次数可能远远高于罕见词，这会使不同词条的频率之间出现显著的偏斜。即使算法能够处理偏斜的特征值，如果认为出现次数高出10倍的词条就重要10倍，某些特定任务也可能无法正常工作。要缩小特征值的范围，同时又让缩小后的值仍随原始数据的增长而增长，一种常见的方法是对该特征值取对数。在本例中，我们使用下面的公式对词频数据进行归一化：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a85632207829" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          LogNormalizedTF = 1 + log10(TermFrequency)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a85632207829-1">
               1
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a85632207829-1">
               <span class="crayon-v">
                LogNormalizedTF
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                +
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                log10
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                TermFrequency
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0065 seconds] -->
       <p>
        使用以10为底的对数意味着词频每增加10倍，对数归一化后的词频就增加1。我们在对数值上再加1，这样一来，对于词频是1的词条来说（log10(1) = 0），归一化后的值刚好也是1。用来计算归一化后的词频的代码非常简单，并依赖于前一节中提到的频率采集器。请注意，此代码同时会将输出转储到一个CSV文件。上文其他的示例代码没有这一步，因为它们其实是那些最终去转储CSV方法的辅助方法。这段代码恰好是在转储到CSV前做了少量工作（计算对数归一化）。
       </p>
       <blockquote>
        <div class="line number1 index0 alt2">
         <code class="python keyword">
          if
         </code>
         <code class="python plain">
          term_frequencies
         </code>
         <code class="python keyword">
          is
         </code>
         <code class="python color1">
          None
         </code>
         <code class="python plain">
          :
         </code>
        </div>
        <div class="line number2 index1 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          term_frequencies
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          collect_term_counts(corpus)
         </code>
        </div>
        <div class="line number3 index2 alt2">
        </div>
        <div class="line number4 index3 alt1">
         <code class="python plain">
          output_csv_file
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          open_csv_file(
         </code>
         <code class="python string">
          "normalized_term_frequencies.csv"
         </code>
         <code class="python plain">
          , [
         </code>
         <code class="python string">
          "Term"
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python string">
          "Log Normalized TF"
         </code>
         <code class="python plain">
          ])
         </code>
        </div>
        <div class="line number5 index4 alt2">
        </div>
        <div class="line number6 index5 alt1">
         <code class="python plain">
          unsorted_array
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          []
         </code>
        </div>
        <div class="line number7 index6 alt2">
        </div>
        <div class="line number8 index7 alt1">
         <code class="python keyword">
          for
         </code>
         <code class="python plain">
          term, frequency
         </code>
         <code class="python keyword">
          in
         </code>
         <code class="python plain">
          term_frequencies.iteritems():
         </code>
        </div>
        <div class="line number9 index8 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          normalized_term_frequency
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          (
         </code>
         <code class="python value">
          1
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python plain">
          math.log(frequency,
         </code>
         <code class="python value">
          10
         </code>
         <code class="python plain">
          ))
         </code>
        </div>
        <div class="line number10 index9 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          unsorted_array.append([term, normalized_term_frequency])
         </code>
        </div>
        <div class="line number11 index10 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          output_csv_file.writerow([term]
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python plain">
          [normalized_term_frequency])
         </code>
        </div>
        <div class="line number12 index11 alt1">
        </div>
        <div class="line number13 index12 alt2">
         <code class="python plain">
          sorted_array
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python functions">
          sorted
         </code>
         <code class="python plain">
          (unsorted_array, key
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python keyword">
          lambda
         </code>
         <code class="python plain">
          term_frequency: term_frequency[
         </code>
         <code class="python value">
          1
         </code>
         <code class="python plain">
          ], reverse
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python color1">
          True
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number14 index13 alt1">
        </div>
        <div class="line number15 index14 alt2">
         <code class="python comments">
          # output a bar chart illustrating the above
         </code>
        </div>
        <div class="line number16 index15 alt1">
         <code class="python plain">
          chart_term_frequencies(
         </code>
         <code class="python string">
          "normalized_term_frequencies.png"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number17 index16 alt2">
         <code class="python spaces">
         </code>
         <code class="python string">
          "Log Normalized Term Frequencies ("
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python plain">
          corpus_name
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python string">
          ")"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number18 index17 alt1">
         <code class="python spaces">
         </code>
         <code class="python string">
          "Term Frequencies"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number19 index18 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          sorted_array, [
         </code>
         <code class="python value">
          0
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python value">
          1
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python value">
          2
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python keyword">
          -
         </code>
         <code class="python value">
          3
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python keyword">
          -
         </code>
         <code class="python value">
          2
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python keyword">
          -
         </code>
         <code class="python value">
          1
         </code>
         <code class="python plain">
          ])
         </code>
        </div>
        <div class="line number20 index19 alt1">
        </div>
        <div class="line number21 index20 alt2">
         <code class="python keyword">
          return
         </code>
         <code class="python plain">
          term_frequencies
         </code>
        </div>
       </blockquote>
       <div>
        <div class="syntaxhighlighter notranslate python" id="highlighter_571307">
        </div>
       </div>
       <p>
        下面的图表和上节“词频”中使用的是相同的数据。它显示了就职演说语料库中的三个最常用的词和三个最不常用的词。令人感兴趣的是词频值的压缩。在语料库中出现频率约2000倍以上的词，它的对数归一化版本比文档中只出现一次得分为1（固有地、不公地、疏远地）的词得分只略微高了一点。出现频率超过8000倍的时候，它的分数也只增长了5倍而已。这种压缩用于将文本特征尺寸保持在一个相对小的数值范围内。
       </p>
       <p>
        <img alt="就职演说语料库中三个最常用词和三个最不常用词的对数归一化词频柱状图" src="http://dataunion.org/wp-content/uploads/2015/05/6941baebgw1ervkgm2t7hj20m80goab7.jpg"/>
       </p>
       <p>
        类似前面的例子，此图也可以直接使用所提供的代码生成。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a92327077224" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          &gt; python words.py -v --logNormalize --genesis
2015-02-02 20:42:02,538 (INFO): The corpus contains 315268 elements after processing
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a92327077224-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-576865d969a92327077224-2">
               2
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a92327077224-1">
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-v">
                logNormalize
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-i">
                genesis
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-576865d969a92327077224-2">
               <span class="crayon-cn">
                2015
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                20
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                42
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                02
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                538
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                INFO
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                The
               </span>
               <span class="crayon-e">
                corpus
               </span>
               <span class="crayon-i">
                contains
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                315268
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                elements
               </span>
               <span class="crayon-e">
                after
               </span>
               <span class="crayon-v">
                processing
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0020 seconds] -->
       <p>
       </p>
       <h5>
        词频频率
       </h5>
       <p>
        不，这不是一个打字错误。词频频率与我们到目前为止讨论的指标稍有不同。这些信息不太可能被直接当作一个ML算法的特征值来用。然而，它可以为检查语料库结构的人提供很多信息。本质上，词频频率对给定的频率的词项进行计数。说再多也比不上直接举一个例子。如果你运行下面的命令行：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-576865d969a9e474545516" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          &gt; python words.py --inaugural -ff
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-576865d969a9e474545516-1">
               1
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-576865d969a9e474545516-1">
               <span class="crayon-o">
                &gt;
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                python
               </span>
               <span class="crayon-v">
                words
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                py
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                --
               </span>
               <span class="crayon-v">
                inaugural
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                ff
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0070 seconds] -->
       <p>
        它会生成一个名为frequency_frequencies.csv的CSV文件，以及下面的图表。
       </p>
       <p>
        <img alt="就职演说语料库的词频频率柱状图" src="http://dataunion.org/wp-content/uploads/2015/05/6941baebgw1ervkgog37qj20m80gomy6.jpg"/>
       </p>
       <p>
        可以看到，图表统计的是恰好出现特定次数的单词有多少个（例如，只出现1次的单词大约有4200个）。这是一种奇怪的指标，但它多少可以给你这个语料库是否都是由很少使用的单词所组成的一个大致印象。在这个例子中，整个语料库包含145735个字。例如，让我们认为出现四次或更少次数的单词就是罕见词，别的则是常用词。我们知道，这个语料库中有4122+1488+817+547=6,974个或约占总字数4.7％的罕见词。与此相比，国情咨文语料库中有9581，总字数为399822，或约占总字数2％的罕见词。这似乎暗示了就职演说与国情咨文相比有着更丰富的词汇量。这是有用的信息吗？可能有。取决于你想了解该文本的什么方面。
       </p>
       <p>
        计算词频频率的代码是非常简单的。它和上文利用相同的词频数据。该方法通过遍历“词/词频”字典，并建立一个新的frequency_frequencies字典，累计对应频率的不同词个数。总而言之，我们对出现在每个频率的词条数进行计数。
       </p>
       <blockquote>
        <div class="line number1 index0 alt2">
         <code class="python keyword">
          def
         </code>
         <code class="python plain">
          collect_and_output_frequency_frequencies(corpus, corpus_name, term_frequencies):
         </code>
        </div>
        <div class="line number2 index1 alt1">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          if
         </code>
         <code class="python plain">
          term_frequencies
         </code>
         <code class="python keyword">
          is
         </code>
         <code class="python color1">
          None
         </code>
         <code class="python plain">
          :
         </code>
        </div>
        <div class="line number3 index2 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          term_frequencies
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          collect_term_counts(corpus)
         </code>
        </div>
        <div class="line number4 index3 alt1">
        </div>
        <div class="line number5 index4 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          frequency_frequencies
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          {}
         </code>
        </div>
        <div class="line number6 index5 alt1">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          for
         </code>
         <code class="python plain">
          term, frequency
         </code>
         <code class="python keyword">
          in
         </code>
         <code class="python plain">
          term_frequencies.iteritems():
         </code>
        </div>
        <div class="line number7 index6 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          if
         </code>
         <code class="python plain">
          frequency_frequencies.has_key(frequency):
         </code>
        </div>
        <div class="line number8 index7 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          frequency_frequencies[frequency]
         </code>
         <code class="python keyword">
          +=
         </code>
         <code class="python value">
          1
         </code>
        </div>
        <div class="line number9 index8 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          else
         </code>
         <code class="python plain">
          :
         </code>
        </div>
        <div class="line number10 index9 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          frequency_frequencies[frequency]
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python value">
          1
         </code>
        </div>
        <div class="line number11 index10 alt2">
        </div>
        <div class="line number12 index11 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          unsorted_array
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          [[key,value]
         </code>
         <code class="python keyword">
          for
         </code>
         <code class="python plain">
          key, value
         </code>
         <code class="python keyword">
          in
         </code>
         <code class="python plain">
          frequency_frequencies.iteritems()]
         </code>
        </div>
        <div class="line number13 index12 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          sorted_array
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python functions">
          sorted
         </code>
         <code class="python plain">
          (unsorted_array, key
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python keyword">
          lambda
         </code>
         <code class="python plain">
          frequency_frequency: frequency_frequency[
         </code>
         <code class="python value">
          1
         </code>
         <code class="python plain">
          ], reverse
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python color1">
          True
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number14 index13 alt1">
        </div>
        <div class="line number15 index14 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          frequency_frequencies_to_chart
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          []
         </code>
        </div>
        <div class="line number16 index15 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          frequencies_to_chart
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          []
         </code>
        </div>
        <div class="line number17 index16 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          output_csv_file
         </code>
         <code class="python keyword">
          =
         </code>
         <code class="python plain">
          open_csv_file(
         </code>
         <code class="python string">
          "frequency_frequencies.csv"
         </code>
         <code class="python plain">
          , [
         </code>
         <code class="python string">
          "Frequency Frequency"
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python string">
          "Term Frequency"
         </code>
         <code class="python plain">
          ])
         </code>
        </div>
        <div class="line number18 index17 alt1">
        </div>
        <div class="line number19 index18 alt2">
         <code class="python spaces">
         </code>
         <code class="python comments">
          # we collect frequencies_to_chart and frequency_frequencies_to_chart each into their own single dimensional
         </code>
        </div>
        <div class="line number20 index19 alt1">
         <code class="python spaces">
         </code>
         <code class="python comments">
          # array.  Then we pass frequency_frequencies_to_chart in an array so that it is 2D as needed by the chart.
         </code>
        </div>
        <div class="line number21 index20 alt2">
         <code class="python spaces">
         </code>
         <code class="python comments">
          # This means there is exactly 1 data set and 6 columns of data in the set.  There is no second set to compare
         </code>
        </div>
        <div class="line number22 index21 alt1">
         <code class="python spaces">
         </code>
         <code class="python comments">
          # it to.
         </code>
        </div>
        <div class="line number23 index22 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          for
         </code>
         <code class="python plain">
          index, (term_frequency, frequency_frequency)
         </code>
         <code class="python keyword">
          in
         </code>
         <code class="python functions">
          enumerate
         </code>
         <code class="python plain">
          (sorted_array):
         </code>
        </div>
        <div class="line number24 index23 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          output_csv_file.writerow([frequency_frequency]
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python plain">
          [term_frequency])
         </code>
        </div>
        <div class="line number25 index24 alt2">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          if
         </code>
         <code class="python plain">
          index
         </code>
         <code class="python keyword">
          &lt;=
         </code>
         <code class="python value">
          20
         </code>
         <code class="python plain">
          :
         </code>
        </div>
        <div class="line number26 index25 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          frequencies_to_chart.extend([term_frequency])
         </code>
        </div>
        <div class="line number27 index26 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          frequency_frequencies_to_chart.extend([frequency_frequency])
         </code>
        </div>
        <div class="line number28 index27 alt1">
        </div>
        <div class="line number29 index28 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          charting.bar_chart(
         </code>
         <code class="python string">
          "frequency_frequencies.png"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number30 index29 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          [frequency_frequencies_to_chart],
         </code>
        </div>
        <div class="line number31 index30 alt2">
         <code class="python spaces">
         </code>
         <code class="python string">
          "Frequency Frequencies ("
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python plain">
          corpus_name
         </code>
         <code class="python keyword">
          +
         </code>
         <code class="python string">
          ")"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number32 index31 alt1">
         <code class="python spaces">
         </code>
         <code class="python plain">
          frequencies_to_chart,
         </code>
        </div>
        <div class="line number33 index32 alt2">
         <code class="python spaces">
         </code>
         <code class="python string">
          "Frequency Frequency"
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number34 index33 alt1">
         <code class="python spaces">
         </code>
         <code class="python color1">
          None
         </code>
         <code class="python plain">
          ,
         </code>
        </div>
        <div class="line number35 index34 alt2">
         <code class="python spaces">
         </code>
         <code class="python plain">
          [
         </code>
         <code class="python string">
          '#59799e'
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python string">
          '#810CE8'
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python string">
          '#FF0000'
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python string">
          '#12995D'
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python string">
          '#FD53FF'
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python string">
          '#AA55CC'
         </code>
         <code class="python plain">
          ],
         </code>
        </div>
        <div class="line number36 index35 alt1">
         <code class="python spaces">
         </code>
         <code class="python value">
          0.2
         </code>
         <code class="python plain">
          ,
         </code>
         <code class="python value">
          0.0
         </code>
         <code class="python plain">
          )
         </code>
        </div>
        <div class="line number37 index36 alt2">
        </div>
        <div class="line number38 index37 alt1">
         <code class="python spaces">
         </code>
         <code class="python keyword">
          return
         </code>
         <code class="python plain">
          frequency_frequencies
         </code>
        </div>
       </blockquote>
       <div>
       </div>
       <h5>
        总结
       </h5>
       <p>
        在这一课中，我们研究了一些基本指标和文本分析的一些基础模块。我们没有做任何的机器学习相关的事情。别担心，ML代码的干货即将到来。在我们进行到那里之前，我们要先了解基础的概率论和一些简单的语言建模技术。将本课程、概率论和一些语言建模技术结合起来，能带领我们接触第一个真正的机器学习任务——朴素贝叶斯分类器。从那里开始，我们将接触一系列不同的ML相关的主题，所以暂时别离开，我们保证讨论ML代码前的主题也能引起你的兴趣。
       </p>
       <p>
        最后，我们希望这个系列是可延展且有见地的。如果我们发现更多更加清晰的材料或新的有用的例子，我们将把它们添加进来。如果你觉得有什么是值得添加的，也请留言给我们。同样重要的是，如果读者发现任何我们弄错的地方，不管是代码还是其他方面，不要犹豫，马上告知我们。我们对此表示衷心的感谢。
       </p>
       <p>
        英文出处：
        <a href="http://www.thoughtly.co/blog/working-with-text/" target="_blank">
         www.thoughtly.co
        </a>
       </p>
      </div>
      <div>
       <strong>
        注：转载文章均来自于公开网络，仅供学习使用，不会用于任何商业用途，如果侵犯到原作者的权益，请您与我们联系删除或者授权事宜，联系邮箱：contact@dataunion.org。转载数盟网站文章请注明原文章作者，否则产生的任何版权纠纷与数盟无关。
       </strong>
      </div>
      <!--content_text-->
      <div class="fenxian">
       <!-- JiaThis Button BEGIN -->
       <div class="jiathis_style_32x32">
        <p class="jiathis_button_weixin">
        </p>
        <p class="jiathis_button_tsina">
        </p>
        <p class="jiathis_button_qzone">
        </p>
        <p class="jiathis_button_cqq">
        </p>
        <p class="jiathis_button_tumblr">
        </p>
        <a class="jiathis jiathis_txt jtico jtico_jiathis" href="http://www.jiathis.com/share" target="_blank">
        </a>
        <p class="jiathis_counter_style">
        </p>
       </div>
       <!-- JiaThis Button END -->
      </div>
     </article>
     <!--content-->
     <!--相关文章-->
     <div class="xianguan">
      <div class="xianguantitle">
       相关文章！
      </div>
      <ul class="pic">
       <li>
        <a href="http://dataunion.org/24220.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/05/100910rrxp5plglxk4lcle-291x200.png"/>
        </a>
        <a class="link" href="http://dataunion.org/24220.html" rel="bookmark" title="520大数据：八成男性期待被表白 “套路”也是爱">
         520大数据：八成男性期待被表白 “套路”也是爱
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/23785.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/04/u7145986293872240263fm21gp0-298x200.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/23785.html" rel="bookmark" title="Docker安全基准：22项新增要点概述">
         Docker安全基准：22项新增要点概述
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/23703.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/04/1317020-300x163.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/23703.html" rel="bookmark" title="推荐：五个不容错过的数据科学家博客">
         推荐：五个不容错过的数据科学家博客
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/23507.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/03/110606wjpugdldig4idddn.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/23507.html" rel="bookmark" title="解密深度学习在智能推荐系统的实践与应用">
         解密深度学习在智能推荐系统的实践与应用
        </a>
       </li>
      </ul>
     </div>
     <!--相关文章-->
     <div class="comment" id="comments">
      <!-- You can start editing here. -->
      <!-- If comments are open, but there are no comments. -->
      <div class="title">
       期待你一针见血的评论，Come on！
      </div>
      <div id="respond">
       <p>
        不用想啦，马上
        <a href="http://dataunion.org/wp-login.php?redirect_to=http%3A%2F%2Fdataunion.org%2F17327.html">
         "登录"
        </a>
        发表自己的想法。
       </p>
      </div>
     </div>
     <!-- .nav-single -->
    </div>
    <!--Container End-->
    <aside id="sitebar">
     <div class="sitebar_list2">
      <div class="wptag">
       <span class="tagtitle">
        热门标签+
       </span>
       <div class="tagg">
        <ul class="menu" id="menu-%e5%8f%8b%e6%83%85%e9%93%be%e6%8e%a5">
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-1605" id="menu-item-1605">
          <a href="http://taidizh.com/">
           泰迪智慧
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-20884" id="menu-item-20884">
          <a href="http://www.transwarp.cn/">
           星环科技
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-3538" id="menu-item-3538">
          <a href="http://datall.org/">
           珈和遥感
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-20888" id="menu-item-20888">
          <a href="http://www.chinahadoop.cn/">
           小象学院
          </a>
         </li>
        </ul>
       </div>
      </div>
     </div>
     <div class="sitebar_list">
      <div class="textwidget">
       <div align="center">
        <a href="http://study.163.com/course/courseMain.htm?courseId=991022" target="_blank">
         <img src="http://dataunion.org/wp-content/uploads/2016/03/dv.jpg"/>
        </a>
       </div>
      </div>
     </div>
     <div class="sitebar_list">
      <h4 class="sitebar_title">
       文章分类
      </h4>
      <div class="tagcloud">
       <a class="tag-link-44" href="http://dataunion.org/category/industry/demo" style="font-size: 10.204724409449pt;" title="4个话题">
        Demo展示
       </a>
       <a class="tag-link-31" href="http://dataunion.org/category/experts" style="font-size: 15.826771653543pt;" title="52个话题">
        专家团队
       </a>
       <a class="tag-link-870" href="http://dataunion.org/category/tech/ai" style="font-size: 19.795275590551pt;" title="273个话题">
        人工智能
       </a>
       <a class="tag-link-488" href="http://dataunion.org/category/%e5%8a%a0%e5%85%a5%e6%95%b0%e7%9b%9f" style="font-size: 8pt;" title="1个话题">
        加入数盟
       </a>
       <a class="tag-link-869" href="http://dataunion.org/category/tech/viz" style="font-size: 17.204724409449pt;" title="93个话题">
        可视化
       </a>
       <a class="tag-link-30" href="http://dataunion.org/category/partners" style="font-size: 10.645669291339pt;" title="5个话题">
        合作伙伴
       </a>
       <a class="tag-link-889" href="http://dataunion.org/category/parterc" style="font-size: 11.582677165354pt;" title="8个话题">
        合作会议
       </a>
       <a class="tag-link-104" href="http://dataunion.org/category/books" style="font-size: 12.96062992126pt;" title="15个话题">
        图书
       </a>
       <a class="tag-link-220" href="http://dataunion.org/category/tech/base" style="font-size: 19.850393700787pt;" title="281个话题">
        基础架构
       </a>
       <a class="tag-link-219" href="http://dataunion.org/category/tech/analysis" style="font-size: 19.409448818898pt;" title="232个话题">
        数据分析
       </a>
       <a class="tag-link-887" href="http://dataunion.org/category/tech/dm" style="font-size: 13.291338582677pt;" title="17个话题">
        数据挖掘
       </a>
       <a class="tag-link-34" href="http://dataunion.org/category/tech" style="font-size: 20.732283464567pt;" title="404个话题">
        文章
       </a>
       <a class="tag-link-1" href="http://dataunion.org/category/uncategorized" style="font-size: 22pt;" title="693个话题">
        未分类
       </a>
       <a class="tag-link-4" href="http://dataunion.org/category/events" style="font-size: 14.503937007874pt;" title="29个话题">
        活动
       </a>
       <a class="tag-link-890" href="http://dataunion.org/category/tech/%e6%b7%b1%e5%ba%a6%e5%ad%a6%e4%b9%a0" style="font-size: 10.204724409449pt;" title="4个话题">
        深度学习
       </a>
       <a class="tag-link-221" href="http://dataunion.org/category/tech/devl" style="font-size: 18.968503937008pt;" title="193个话题">
        编程语言
       </a>
       <a class="tag-link-888" href="http://dataunion.org/category/career" style="font-size: 15.661417322835pt;" title="48个话题">
        职业规划
       </a>
       <a class="tag-link-5" href="http://dataunion.org/category/jobs" style="font-size: 14.11811023622pt;" title="25个话题">
        职位
       </a>
       <a class="tag-link-871" href="http://dataunion.org/category/industry" style="font-size: 15.716535433071pt;" title="49个话题">
        行业
       </a>
       <a class="tag-link-613" href="http://dataunion.org/category/industry/case" style="font-size: 16.984251968504pt;" title="84个话题">
        行业应用
       </a>
       <a class="tag-link-885" href="http://dataunion.org/category/industry/news" style="font-size: 17.425196850394pt;" title="102个话题">
        行业资讯
       </a>
       <a class="tag-link-10" href="http://dataunion.org/category/training" style="font-size: 14.228346456693pt;" title="26个话题">
        课程
       </a>
       <a class="tag-link-16" href="http://dataunion.org/category/sources" style="font-size: 15.661417322835pt;" title="48个话题">
        资源
       </a>
      </div>
     </div>
     <div class="sitebar_list">
      <h4 class="sitebar_title">
       功能
      </h4>
      <ul>
       <li>
        <a href="http://dataunion.org/wp-login.php?action=register">
         注册
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/wp-login.php">
         登录
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/feed">
         文章
         <abbr title="Really Simple Syndication">
          RSS
         </abbr>
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/comments/feed">
         评论
         <abbr title="Really Simple Syndication">
          RSS
         </abbr>
        </a>
       </li>
       <li>
        <a href="https://cn.wordpress.org/" title="基于WordPress，一个优美、先进的个人信息发布平台。">
         WordPress.org
        </a>
       </li>
      </ul>
     </div>
    </aside>
    <div class="clear">
    </div>
   </div>
   <!--main-->
   <footer id="dibu">
    <div class="about">
     <div class="right">
      <ul class="menu" id="menu-%e5%ba%95%e9%83%a8%e8%8f%9c%e5%8d%95">
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-18024" id="menu-item-18024">
        <a href="http://dataunion.org/category/partners">
         合作伙伴
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-20881" id="menu-item-20881">
        <a href="http://dataunion.org/contribute">
         文章投稿
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20872" id="menu-item-20872">
        <a href="http://dataunion.org/category/%e5%8a%a0%e5%85%a5%e6%95%b0%e7%9b%9f">
         加入数盟
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-22441" id="menu-item-22441">
        <a href="http://dataunion.org/f-links">
         友情链接
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-20874" id="menu-item-20874">
        <a href="http://dataunion.org/aboutus">
         关于数盟
        </a>
       </li>
      </ul>
      <p class="banquan">
       数盟社区，做最棒的数据科学社区
      </p>
     </div>
     <div class="left">
      <ul class="bottomlist">
       <li>
        <a href="http://weibo.com/DataScientistUnion" target="_blank" title="">
         <img src="http://dataunion.org/wp-content/themes/yzipi/images/weibo.png"/>
        </a>
       </li>
       <li>
        <a class="cd-popup-trigger" href="http://dataunion.org/17327.html#0">
         <img src="http://dataunion.org/wp-content/themes/yzipi/images/weixin.png"/>
        </a>
       </li>
      </ul>
      <div class="cd-popup">
       <div class="cd-popup-container">
        <h1>
         扫描二维码,加微信公众号
        </h1>
        <img src="http://dataunion.org/wp-content/themes/yzipi/images/2014-12-06-1515289049.png"/>
        <a class="cd-popup-close" href="http://dataunion.org/17327.html">
        </a>
       </div>
       <!-- cd-popup-container -->
      </div>
      <!-- cd-popup -->
     </div>
    </div>
    <!--about-->
    <div class="bottom">
     <a href="http://dataunion.org/">
      数盟社区
     </a>
     <a href="http://www.miitbeian.gov.cn/" rel="external nofollow" target="_blank">
      京ICP备14026740号
     </a>
     联系我们：
     <a href="mailto:contact@dataunion.org" target="_blank">
      contact@dataunion.org
     </a>
     <div class="tongji">
     </div>
     <!--bottom-->
     <div class="scroll" id="scroll" style="display:none;">
      ︿
     </div>
    </div>
   </footer>
   <!--dibu-->
  </div>
 </body>
</html>